/*
Content: Merging and cleaning the raw SOEP files
Paper: "Immigrants' Return Intentions and Labor Market Behavior when the Home Country is Unsafe"
Authors: Jacopo Bassetto, Teresa Freitas-Monteiro
*/

*******************************************
** MERGE GSOEP DATABASES ******************
*******************************************

* Build the person-year master file. pl.dta defines the set of rows: every
* merge below keeps master-only and matched rows and discards rows that
* exist only in the using file, and no _merge indicator is left behind.
use "${orig}/pl.dta", clear // This is the main SOEP file

* ppathl: person-year match, only the listed variables are brought in
merge 1:1 pid syear using "${orig}/ppathl.dta", keep(master match) nogenerate ///
    keepusing(pid sex immiyear germborn corigin gebjahr gebmonat migback birthregion arefback parid partner pbleib phrf)
label language

* pgen: person-year match, all variables
merge 1:1 pid syear using "${orig}/pgen.dta", keep(master match) nogenerate
label language

* hbrutto: household-year match, region type and federal state only
merge m:1 hid syear using "${orig}/hbrutto.dta", keep(master match) nogenerate ///
    keepusing(hid syear regtyp bula)
label language

* bioimmig: person-year match, all variables
merge 1:1 pid syear using "${orig}/bioimmig.dta", keep(master match) nogenerate
label language

* bioparen: one record per person, repeated over that person's survey years
merge m:1 pid using "${orig}/bioparen.dta", keep(master match) nogenerate
label language

* hl: household-year match, all variables
merge m:1 hid syear using "${orig}/hl.dta", keep(master match) nogenerate
label language

* Switch variable and value labels to the EN label language
label language EN


* Clean-up pass over every numeric variable (string variables are untouched):
*   - a variable without a single strictly positive, non-missing value is dropped
*   - in every variable that is kept, negative values are set to system missing
quietly ds, has(type numeric)
foreach v of varlist `r(varlist)' {
    quietly count if `v' > 0 & !missing(`v')
    if r(N) == 0 {
        drop `v'
    }
    else {
        quietly replace `v' = . if `v' < 0
    }
}


save "${final}/rawdata_soep.dta", replace

*******************************************
** CREATE VARIABLES ******************
*******************************************

use "${final}/rawdata_soep.dta", clear

* Declare the panel and number each person's observations in survey-year order
sort pid syear
xtset pid syear
by pid: gen obs = _n
by pid: egen maxobs = max(obs)

* First observation: flag (1 on the first row, missing otherwise) and its
* survey year, spread over all rows of the person
gen firstobs = 1 if obs == 1
by pid: egen firstobsyear = max(cond(obs == 1, syear, .))

* Last observation: flag (1 on the last row, missing otherwise)
svyset [pweight=phrf]
sort pid syear
gen lastobs = 1 if obs == maxobs

* Carry backwards variable: years between the survey year and 2020
gen yhelp = 2020 - syear

sort pid syear

*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Household Variables :
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Children Under 16 Yr In Household
rename hlk0044 child_under16_inHH

* Number of children.
* Starts from hlc0043; each following step only fills values that are still
* missing, so the order of the steps defines the priority of the sources.
rename hlc0043 numberchildren
replace numberchildren=0 if child_under16_inHH==2 & numberchildren==.
replace numberchildren=0 if pld0173==2 & numberchildren==.
replace numberchildren=pld0172 if pld0172!=. & numberchildren==.

* helper = plj0098 + plj0100; if only one of the two is observed, that one
* is used on its own; if both are missing, helper stays missing.
g helper=plj0098+plj0100 if plj0098!=. & plj0100!=.
replace helper=plj0098 if plj0098!=. & plj0100==.
replace helper=plj0100 if plj0100!=. & plj0098==.

replace numberchildren=helper if numberchildren==. & helper!=.
* helper is scratch only: remove it here instead of relying on a later
* wildcard drop to clean it up.
drop helper
replace numberchildren=hlj0044 if numberchildren==.

* Has at least 1 child.
* A missing number of children is treated as "no child" (child = 0).
* The full variable name is spelled out so the line does not depend on
* variable abbreviation being enabled.
g child=0 if numberchildren == 0 | numberchildren == .
replace child=1 if numberchildren>0 & numberchildren!=.
label var child "Has children"

* Relatives abroad.
* Each indicator is cloned from a bi* variable and, where it is still
* missing, filled from the corresponding plj* item.
* Spouse living Abroad? In Germany and Same household?
clonevar spouse_ingermany=birelhs2
replace spouse_ingermany=plj0093 if spouse_ingermany==. & plj0093!=.
clonevar spouse_abroad=birelhsp

*Children in the home country?
clonevar under_age_child_abroad=birelhc2
clonevar children_abroad=birelhc

* Fill gaps only: set to 1 when plj0097 / plj0099 equal 1
replace children_abroad=1 if children_abroad==. & plj0097==1
replace children_abroad=1 if children_abroad==. & plj0099==1
replace under_age_child_abroad=1 if under_age_child_abroad==. & plj0097==1


* Other Family in home country 
clonevar parents_abroad=birelhp
rename plj0094 mother_abroad
rename plj0095 father_abroad
* parents_abroad is set to 1 (only where missing) if either parent item equals 1
replace parents_abroad=1 if mother_abroad==1 & parents_abroad==.
replace parents_abroad=1 if father_abroad==1 & parents_abroad==.
rename plj0101 siblings_abroad // (plj0102 Number Of Siblings Not In Germany)

clonevar brother_sis_abroad=birelhbs
replace brother_sis_abroad=1 if brother_sis_abroad==. & siblings_abroad==1

clonevar family_abroad=birelh
clonevar grandparents_abroad=birelhgp
clonevar distant_rel_abroad=birelhdr
clonevar friends_abroad=birelhfr
clonevar persons_abroad_bringGER=birelhmi

* family_abroad_4 starts as a copy of family_abroad (birelh) and is then set
* to 1 whenever any single close-relative indicator equals 1 (parents,
* siblings, grandparents, children, spouse). These replaces are
* unconditional, so they also overwrite a non-missing starting value.
g family_abroad_4=family_abroad
replace family_abroad_4=1 if mother_abroad==1
replace family_abroad_4=1 if father_abroad==1
replace family_abroad_4=1 if siblings_abroad==1
replace family_abroad_4=1 if brother_sis_abroad==1
replace family_abroad_4=1 if grandparents_abroad==1
replace family_abroad_4=1 if  children_abroad==1
replace family_abroad_4=1 if  under_age_child_abroad==1
replace family_abroad_4=1 if  spouse_abroad==1
* Where still missing: code 2 if at least one indicator equals 0.
* NOTE(review): this assumes the indicators use 0 for "no" - confirm the coding.
replace family_abroad_4=2 if (mother_abroad==0 |father_abroad==0 | siblings_abroad==0 | brother_sis_abroad==0 | grandparents_abroad==0 | children_abroad==0 | under_age_child_abroad==0 | spouse_abroad==0) & family_abroad_4==.
* NOTE(review): presumably plj0103 is the "no relatives abroad" item - confirm.
replace family_abroad_4=2 if plj0103==1  & family_abroad_4==.
* NOTE(review): this line looks like a no-op. Negative codes are set to
* missing before rawdata_soep.dta is saved, so birelh==-2 should not occur,
* and birelh==1 implies family_abroad_4 is already 1 (not missing). Confirm
* the intent (e.g. whether "does not apply" should map to 2).
replace family_abroad_4=2 if (birelh==-2 | birelh==1) & family_abroad_4==.

label var family_abroad_4 "Family abroad"

* Keep an untouched copy of the indicator before any gaps are filled
gen family_abroad_4_orig = family_abroad_4

* Fill gaps from neighbouring observations of the same person. Priority:
* next observation, second-next, previous, second-previous; anything still
* missing afterwards is coded 2.
* NOTE(review): the offsets are observation positions within pid, not
* calendar years; with gaps in participation a neighbour can be more than
* 1 or 2 years away - confirm this is intended.
sort pid syear
local k = 0
foreach shift in +1 +2 -1 -2 {
    local ++k
    by pid (syear): gen help`k' = family_abroad_4[_n`shift']
}
forvalues j = 1/4 {
    replace family_abroad_4 = help`j' if family_abroad_4 == .
}
* The wildcard removes the four neighbour variables and any other variable
* whose name starts with "help".
cap drop help*

label val family_abroad_4 birelh_EN
replace family_abroad_4 = 2 if family_abroad_4 == .


*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Individual Characteristics :
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Gender: 1 if sex==2, 0 if sex==1, missing otherwise
gen female = (sex == 2) if inlist(sex, 1, 2)
label var female "Female"

* Marital status: pgfamstd
tab pgfamstd syear if obs==1, m

* 1 if pgfamstd==1, 0 for every other observed status.
* A missing pgfamstd stays missing: in Stata "pgfamstd != 1" is also true
* for missing values, which would silently code unknown marital status as
* "Not married".
gen maritstat = (pgfamstd == 1) if !missing(pgfamstd)

label def maritstat 0 "Not married" 1 "Married", replace
label val maritstat maritstat
label var maritstat "Marital status"

* Age: survey year minus birth year; values above 100 are discarded
rename gebjahr birthy
gen age = syear - birthy
replace age = . if age > 100
label variable age "Age"

* Age bands: 1 = 15-24, 2 = 25-34, 3 = 35-44, 4 = 45-54, 5 = 55 and older.
* Ages of 14 or below and missing ages get no band.
gen age_g = 1 + (age > 24) + (age > 34) + (age > 44) + (age > 54) if age > 14 & age != .

label def ageg 1 "less 24" 2 "25-34" 3 "35-44" 4 "45-54" 5 "more 55", replace
label values age_g ageg
label var age_g "5 year age groups"


* Years since migration: survey year minus immigration year
gen ysm = syear - immiyear
label var ysm "Years since mig."

* Bands: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15 or more years
cap drop ysm_cat
gen ysm_cat = cond(inrange(ysm, 0, 4), 1, ///
              cond(inrange(ysm, 5, 9), 2, ///
              cond(inrange(ysm, 10, 14), 3, ///
              cond(ysm >= 15 & ysm != ., 4, .))))

* Federal state of residence: one dummy (land_d1, land_d2, ...) per value of bula
tabulate bula, generate(land_d)

* Immigration group, collapsed from biimgrp:
*   1 <- biimgrp 2 or 4;  2 <- biimgrp 5;  3 <- biimgrp 3, 6 or missing
gen immig_g2 = 3 if inlist(biimgrp, 3, 6) | biimgrp == .
replace immig_g2 = 1 if inlist(biimgrp, 2, 4)
replace immig_g2 = 2 if biimgrp == 5

* Region of Origin
* Map the country-of-origin code (corigin) to 12 regions. The code lists are
* mutually exclusive; any other positive code falls into 11 "Others".
* A missing corigin leaves region_origin missing.
gen region_origin = 0 if corigin == 1
replace region_origin = 1 if inlist(corigin, 4, 5, 6, 10, 11, 13, 14, 15, 17, 28, 71, 117, 118)
replace region_origin = 2 if inlist(corigin, 22, 26, 31, 58, 101, 103, 146, 122, 123)
replace region_origin = 3 if inlist(corigin, 21, 29, 119)
replace region_origin = 4 if inlist(corigin, 32, 78, 130, 141, 148, 222, 73, 132)
replace region_origin = 5 if inlist(corigin, 3, 75, 120, 121, 140, 165, 168)
replace region_origin = 6 if inlist(corigin, 16, 18, 19, 20, 23, 34, 39, 40, 41, 55, 56, 62, 70)
replace region_origin = 7 if inlist(corigin, 2, 24, 30, 33, 46, 52, 60, 81, 76, 79, 90, 111)
replace region_origin = 8 if inlist(corigin, 36, 37, 47, 49, 53, 54, 57, 84, 80, 86, 89, 102, 110, 113, ///
    125, 142, 143, 144, 156, 162, 166, 170, 171, 173, 178, 183)
replace region_origin = 9 if inlist(corigin, 42, 43, 50, 65, 66, 85)
replace region_origin = 10 if inlist(corigin, 74, 77, 82, 155, 97)
replace region_origin = 11 if corigin>0 & corigin!=. & region_origin ==.

* ", replace" keeps the definition re-runnable, as for region_origin2 below
label def region_origin 0 "German" 1 "EU15" 2 "EU enlargement, 2004" 3 "EU enlargement, 2007-2013" 4 "Russia + other former Soviet Union" 5 "Ex-Yugoslavia+2" ///
 6 "Rest of OECD" 7 "Turkey and Arab Countries" 8 "Other African" 9 "South Asia" 10 "Central Asia" 11 "Others", replace
label value region_origin region_origin


* Coarser grouping: 1 = regions 1-3, 2 = regions 4-5, 3 = region 7,
* 4 = every other non-missing region (this includes region 0 "German").
g region_origin2=1 if inrange(region_origin,1,3)
replace region_origin2=2 if inrange(region_origin,4,5)
replace region_origin2=3 if region_origin== 7
replace region_origin2=4 if  region_origin!=. & region_origin2==.
label def region_origin2 1 "EU" 2 "Russia + other former Soviet Union + Ex-Yugoslavia" ///
 3 "Turkey + Arab S. Countries" 4 "Others", replace
label val region_origin2 region_origin2

* Non-EU origin: 0 for regions 0-3, 1 for regions 4-11.
* Unknown origin stays missing: !inrange(., 0, 3) evaluates to true, so
* without the guard a missing region would be classified as non-EU.
g non_eu = !inrange(region_origin,0,3) if !missing(region_origin)

drop region_origin
 
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Labour market :
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*

* Employment status, collapsed from plb0022_h:
*   codes 1 and 2 are kept, codes 3-8 become 3, code 9 becomes 4
gen employ_s2 = plb0022_h if inlist(plb0022_h, 1, 2)
replace employ_s2 = 3 if inlist(plb0022_h, 3, 4, 5, 6, 7, 8)
replace employ_s2 = 4 if plb0022_h == 9

label def employ_s2 1 "Full-Time" 2 "Part-Time" 3 "Other" 4 "Not Emp" 5 "M.P. Leave", replace
label val employ_s2 employ_s2
label var employ_s2 "Employment Status"


* Employed: 1 for full-time/part-time, 0 for "Other"/"Not Emp", missing otherwise
gen employed = inlist(employ_s2, 1, 2) if inlist(employ_s2, 1, 2, 3, 4)

* Reservation wage
	* CPI: deflator, index with 2010 = 100, one value per survey year 1992-2018.
	* Survey years outside this range keep a missing cpi.
		gen cpi = . 
		replace cpi =  73.8 if syear == 1992
		replace cpi =  77.1 if syear == 1993
		replace cpi =  79.1 if syear == 1994
		replace cpi =  80.5 if syear == 1995
		replace cpi =  81.6 if syear == 1996
		replace cpi =  83.2 if syear == 1997
		replace cpi =  84.0 if syear == 1998
		replace cpi =  84.5 if syear == 1999
		replace cpi =  85.7 if syear == 2000
		replace cpi =  87.4 if syear == 2001
		replace cpi =  88.6 if syear == 2002
		replace cpi =  89.6 if syear == 2003
		replace cpi =  91.0 if syear == 2004
		replace cpi =  92.5 if syear == 2005
		replace cpi =  93.9 if syear == 2006
		replace cpi =  96.1 if syear == 2007
		replace cpi =  98.6 if syear == 2008
		replace cpi =  98.9 if syear == 2009
		replace cpi = 100.0	if syear == 2010
		replace cpi = 102.1 if syear == 2011
		replace cpi = 104.1 if syear == 2012
		replace cpi = 105.7 if syear == 2013
		replace cpi = 106.6 if syear == 2014

		* 2015-2018 previously repeated the 2014 value as a placeholder, which
		* leaves these years undeflated relative to 2014.
		* NOTE(review): the values below continue the same series (German
		* consumer price index, base 2010 = 100) - verify against the official
		* Destatis long series before use.
		replace cpi = 106.9 if syear == 2015
		replace cpi = 107.4 if syear == 2016
		replace cpi = 109.3 if syear == 2017
		replace cpi = 111.4 if syear == 2018

	* Reservation wages: log monthly and hourly 
	rename   plb0420_h 	reswage	// note: there is probably many missings
	
	* Rescale by 0.51 wherever plb0420_v1 is observed.
	* NOTE(review): presumably a DM-to-euro conversion (official rate
	* 1 EUR = 1.95583 DM, i.e. about 0.5113) - confirm that plb0420_h is not
	* already expressed in euro for these observations.
	replace reswage = reswage * 0.51 if plb0420_v1 !=.
	* Deflate with cpi (index, 2010 = 100)
	gen reswage_defl = reswage / cpi * 100 
	
	label var reswage "Reservation Wage"
	* Hourly value: deflated wage divided by 4 x weekly hours. plb0422 is used
	* first; wherever plb0241_h is observed and non-zero it takes precedence.
	gen h_reswage_defl = reswage_defl/(plb0422*4) if plb0422 !=. & plb0422 !=0 
	replace h_reswage_defl = reswage_defl/(plb0241_h*4) if plb0241_h !=. & plb0241_h !=0 
	 
	* 1 if a reservation wage is reported; 0 if plb0421 == 1 (this overrides the 1)
	gen has_reswage = 1 if reswage !=. 
	replace has_reswage = 0 if  plb0421 == 1

	* Logs of the deflated measures. ln_reswage_defl must be based on
	* reswage_defl, not on the nominal reswage, to match its name and the
	* hourly counterpart.
	gen ln_h_reswage_defl = ln(h_reswage_defl)
	gen ln_reswage_defl = ln(reswage_defl)
	
* JOB SEARCH in the last 4 weeks (1999-2018)
rename plb0424_v2 jobsearch

* Preferred number of hours: plb0422, kept only when between 0 and 100
cap drop pref_hours
gen pref_hours = plb0422 if plb0422 >= 0 & plb0422 <= 100


* Full-time job search preference: 1 if 38 hours or more, 0 if above 0 and below 38
cap drop search_fulltime
gen search_fulltime = (plb0422 >= 38) if plb0422 > 0 & plb0422 != .
tab plb0422

* Success in Job search: plb0417_v2 codes 1-3 become 0, code 4 becomes 1,
* any other value is carried over unchanged
cap drop findfuturejob
recode plb0417_v2 (4 = 1) (1 2 3 = 0), generate(findfuturejob)

* Urgency in Job search: 1 if plb0418==1 or plb0423==1; otherwise 0 if
* plb0418 is 3 or 4, or plb0423==2; missing in all remaining cases
cap drop urgency
gen urgency = 1 if plb0418 == 1 | plb0423 == 1
replace urgency = 0 if missing(urgency) & (inlist(plb0418, 3, 4) | plb0423 == 2)


*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Education:
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
tab syear pgisced11 // 2010-2018
tab syear pgisced97 // 1984-2018

* Three levels from pgisced97: 0 = up to 2, 1 = exactly 3, 2 = above 3
gen educ_aftm = cond(pgisced97 <= 2, 0, ///
                cond(pgisced97 == 3, 1, ///
                cond(pgisced97 > 3 & pgisced97 != ., 2, .)))

label def educ_aftm 0 "Lower Sec. or below" 1 "Upper Second." 2 "Post-Sec. and Uni", replace
label values educ_aftm educ_aftm


* Four levels from pgisced97: 0 = up to 2, 1 = exactly 3, 2 = above 3 up to 5, 3 = exactly 6
gen educ_aftm2 = cond(pgisced97 <= 2, 0, ///
                 cond(pgisced97 == 3, 1, ///
                 cond(pgisced97 > 3 & pgisced97 <= 5, 2, ///
                 cond(pgisced97 == 6, 3, .))))


label def educ_aftm2 0 "Lower Sec. or below" 1 "Upper Second." 2 "Post-Sec. and Short" 3 "Higher Educ", replace
label values educ_aftm2 educ_aftm2

* One dummy (edu_d1, edu_d2, ...) per level of educ_aftm2
tabulate educ_aftm2, generate(edu_d)

*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Language skills:
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*

* Oral ability in German: plj0066 is used first, plj0071 fills the gaps
tab syear plj0066 // 1985-2005 Own Opinion of Spoken German
tab syear plj0071 // 2007-2018 Oral Ability: German
clonevar oral_german = plj0066
replace oral_german = plj0071 if oral_german == .

* Grouped version: 1 = values 1-2, 2 = values 3-5, 3 = no information (missing)
cap drop oral_german_g
gen oral_german_g = cond(oral_german == ., 3, ///
                    cond(inrange(oral_german, 1, 2), 1, ///
                    cond(inrange(oral_german, 3, 5), 2, .)))
label def oral_german_g 1 "Good Oral German" 2 "Not Good Oral Ger." 3 "No Information"
label val oral_german_g oral_german_g

*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Attitudes, feelings, preferences:
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* World's views: give the "worried about ..." items readable names
rename (plh0032 plh0035 plh0036 plh0040 plj0047) ///
       (worried_econdev worried_ownhealth worried_environ worried_crime worried_mighate)

* Health satisfaction
rename plh0171 health_satisfaction
tab syear health_satisfaction // 1984-2018

* Willingness to take risks: copy of plh0204_h without negative codes
clonevar willingness_risk = plh0204_h
replace willingness_risk = . if willingness_risk < 0

* Risk averse: 1 if willingness is between 0 and 3, 0 for any other observed value
gen risk_averse = inrange(willingness_risk, 0, 3) if willingness_risk != .

*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Remittances
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*

*Sent Money To Native Country Prev Yr: pld0120

* For each type of recipient there is a "paid?" item and a location item.
* Shared value label for the location items
label def location 1 "Germany" 2 "Abroad"
label values plj0133_h plj0137 plj0142_h plj0145_h plj0149_h location

* Parents: Paid? Payments To Parents,-Inlaw Previous Yr [variable: plj0131]
label variable plj0131 "Payments to parents living abroad"
label variable plj0133_h "Parents live in Germany or Abroad"

* Children: Paid? Payments To Children Previous Yr [variable: plj0135]
label variable plj0135 "Payments to children living abroad"
label variable plj0137 "Children lives in Germany or Abroad"

* Spouse: Paid? Payments To Spouse Previous Yr [variable: plj0139]
label variable plj0139 "Payments to spouse living abroad"
label variable plj0142_h "Spouse lives in Germany or Abroad"

* Other relatives: Paid? Payments To Relatives Previous Yr [variable: plj0143]
label variable plj0143 "Payments to relatives living abroad"
label variable plj0145_h "Relatives live in Germany or Abroad"

* Other persons not related: Paid? Payments To Others Previous Yr [variable: plj0147]
label variable plj0147 "Payments to others living abroad"
label variable plj0149_h "Unrelated person lives in Germany or Abroad"

* No payments [variable: plj0151]: negative codes to missing
capture replace plj0151 = . if plj0151 < 0

* send_money_abroad: 0 as soon as any payment item is observed, 1 if a
* payment was made (item == 1) to a recipient located abroad (location == 2)
cap drop send_money_abroad
gen send_money_abroad = 0 if !(plj0131 == . & plj0135 == . & plj0139 == . & plj0143 == . & plj0147 == . & plj0151 == .)
local where plj0133_h plj0137 plj0142_h plj0145_h plj0149_h
local k = 0
foreach paid of varlist plj0131 plj0135 plj0139 plj0143 plj0147 {
    local ++k
    local loc : word `k' of `where'
    replace send_money_abroad = 1 if `paid' == 1 & `loc' == 2
}



*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Intentions to remain in Germany
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* Two variables: Harmonization
* Wish To Remain Germany Permanently [variable: plj0085]
* BI: Desire To Stay In Germany bistay
* plj0085_v2 is used first, plj0085_v1 fills the gaps
clonevar remain_germany = plj0085_v2
replace remain_germany = plj0085_v1 if remain_germany == .
label var remain_germany "Wish to remain Ger permanently"

* Binary version: code 2 becomes 0, code 3 becomes missing
recode remain_germany (2 = 0) (3 = .), generate(remain_ger_per)
* Where still missing, fall back on bistay: 1 or 2 -> 0, 3 -> 1
replace remain_ger_per = (bistay == 3) if remain_ger_per == . & inlist(bistay, 1, 2, 3)

label def remain_ger 0 "No" 1 "Yes", replace
label values remain_ger_per remain_ger
label var remain_ger_per "Remain GER perman."


*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
* News consumption
*~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~**~*~*~*~*~*~*~*~*
rename plj0070 newspaper_lang  // 1988-2012 every 2 years Newspaper Germany, Country Of Origin

* Grouped version: 1 = values 1-2, 2 = value 3, 3 = values 4-5, 4 = value 6,
* 5 = everything else (including missing)
gen newspaper_lang_g = 5
replace newspaper_lang_g = 1 if inrange(newspaper_lang, 1, 2)
replace newspaper_lang_g = 2 if newspaper_lang == 3
replace newspaper_lang_g = 3 if inrange(newspaper_lang, 4, 5)
replace newspaper_lang_g = 4 if newspaper_lang == 6


* Overwrite the merged raw file with the version that includes all created variables
save "${final}/rawdata_soep.dta", replace
